Analysis of single-cell RNA-Seq data of Human spematogenesis (GEO: GSE109037 and GSE120508)

Author: Prabhakaran Munusamy

In [1]:
%config InlineBackend.figure_formats = ["retina"]
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import scanpy as sc
from scipy import stats
from  more_itertools import unique_everseen
import matplotlib.pyplot as plt
import matplotlib.colors
import seaborn as sns
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.n_jobs = 16
scanpy==1.4.7.dev82+ge1cd0d8.d20200805 anndata==0.7.1 umap==0.3.10 numpy==1.18.1 scipy==1.4.1 pandas==1.0.3 scikit-learn==0.22.1 statsmodels==0.11.0rc2 python-igraph==0.7.1 leidenalg==0.7.0
In [2]:
import anndata2ri
anndata2ri.activate()
In [3]:
%reload_ext rpy2.ipython
In [4]:
plt.rcParams.update({"font.family":"Reem Kufi"})
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
In [5]:
heatcolors = matplotlib.colors.LinearSegmentedColormap.from_list("", ["gray", "#000004FF", "#330A5FFF", "#781C6DFF",
                                                                      "#BB3754FF", "#ED6925FF", 
                                                                      "#FCB519FF", "#FCFFA4FF"])

heatcolors_wr = matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "#FFF5F0", "#FEE0D2", "#FCBBA1",
                                                                         "#FC9272", "#FB6A4A", "#EF3B2C", 
                                                                         "#CB181D", "#A50F15", "#67000D"])
In [6]:
sc.settings.set_figure_params(dpi=100, dpi_save=300, format="pdf", frameon=True)
sns.set_context("paper", font_scale=1, rc={"font.size":8,"axes.labelsize":8,"axes.titlesize":14})
In [7]:
%%R
suppressWarnings(suppressMessages(require(scran)))
suppressWarnings(suppressMessages(require(scater)))
suppressWarnings(suppressMessages(library(DropletUtils)))
suppressWarnings(suppressMessages(library(batchelor)))
options(mc.cores = 24L)
In [8]:
%%R
# Load the individual data
## Guo Cairns data
SRR6860519 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Guo_Cairns/SRR6860519/outs/filtered_feature_bc_matrix/")
colData(SRR6860519)$Library <- rep("SRR6860519", ncol(SRR6860519))
colData(SRR6860519)$Sample <- rep("Donor1", ncol(SRR6860519))
SRR6860520 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Guo_Cairns/SRR6860520/outs/filtered_feature_bc_matrix/")
colData(SRR6860520)$Library <- rep("SRR6860520", ncol(SRR6860520))
colData(SRR6860520)$Sample <- rep("Donor1", ncol(SRR6860520))
SRR6860521 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Guo_Cairns/SRR6860521/outs/filtered_feature_bc_matrix/")
colData(SRR6860521)$Library <- rep("SRR6860521", ncol(SRR6860521))
colData(SRR6860521)$Sample <- rep("Donor2", ncol(SRR6860521))
SRR6860522 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Guo_Cairns/SRR6860522/outs/filtered_feature_bc_matrix/", )
colData(SRR6860522)$Library <- rep("SRR6860522", ncol(SRR6860522))
colData(SRR6860522)$Sample <- rep("Donor2", ncol(SRR6860522))
SRR6860523 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Guo_Cairns/SRR6860523/outs/filtered_feature_bc_matrix/", )
colData(SRR6860523)$Library <- rep("SRR6860523", ncol(SRR6860523))
colData(SRR6860523)$Sample <- rep("Donor3", ncol(SRR6860523))
SRR6860524 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Guo_Cairns/SRR6860524/outs/filtered_feature_bc_matrix/", )
colData(SRR6860524)$Library <- rep("SRR6860524", ncol(SRR6860524))
colData(SRR6860524)$Sample <- rep("Donor3", ncol(SRR6860524))
## Brian Hermann data
Adult3 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Hermann_McCarrey/AdultHuman_17-3/outs/filtered_feature_bc_matrix", )
colData(Adult3)$Library <- rep("Adult3", ncol(Adult3))
colData(Adult3)$Sample <- rep("Adult3", ncol(Adult3))
Adult4 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Hermann_McCarrey/AdultHuman_17-4/outs/filtered_feature_bc_matrix", )
colData(Adult4)$Library <- rep("Adult4", ncol(Adult4))
colData(Adult4)$Sample <- rep("Adult4", ncol(Adult4))
Adult5 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Human_Spermatogenesis/Hermann_McCarrey/AdultHuman_17-5/outs/filtered_feature_bc_matrix", )
colData(Adult5)$Library <- rep("Adult5", ncol(Adult5))
colData(Adult5)$Sample <- rep("Adult5", ncol(Adult5))
In [9]:
%%R -o hs_sce
# Merge all datasets
hs_sce <- cbind(SRR6860519, SRR6860520, SRR6860521, SRR6860522, SRR6860523, SRR6860524, Adult3, Adult4, Adult5)
rownames(hs_sce) <- rowData(hs_sce)$Symbol
print(hs_sce)
class: SingleCellExperiment 
dim: 33538 42938 
metadata(9): Samples Samples ... Samples Samples
assays(1): counts
rownames(33538): MIR1302-2HG FAM138A ... AC213203.1 FAM231C
rowData names(3): ID Symbol Type
colnames: NULL
colData names(3): Sample Barcode Library
reducedDimNames(0):
spikeNames(0):
altExpNames(0):
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
In [10]:
%%R
bcrank <- barcodeRanks(counts(hs_sce))

# Only showing unique points for plotting speed.
uniq <- !duplicated(bcrank$rank)
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
    xlab="Rank", ylab="Total UMI count", cex.lab=1.2)

abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2)

legend("bottomleft", legend=c("Knee"), 
        col=c("dodgerblue"), lty=2, cex=1.2)
In [11]:
%%R
MT.Genes <- grep("^MT-", rownames(hs_sce), value = T)
print(MT.Genes)
 [1] "MT-ND1"  "MT-ND2"  "MT-CO1"  "MT-CO2"  "MT-ATP8" "MT-ATP6" "MT-CO3" 
 [8] "MT-ND3"  "MT-ND4L" "MT-ND4"  "MT-ND5"  "MT-ND6"  "MT-CYB" 
In [12]:
%%R
# Calculate QC metrics
hs_sce <- addPerCellQC(hs_sce, subsets = list(mt=MT.Genes))

#Remove low quality cells i.e., those with less than 200 genes expressed and less than 3500 UMI counts
hs_sce <- hs_sce[,hs_sce$detected > 200 & hs_sce$sum > 3500 & hs_sce$subsets_mt_percent < 20]

# Remove unxpressed genes 
hs_sce <- hs_sce[Matrix::rowSums(counts(hs_sce)) > 0, ]
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
In [13]:
%%R
print(hs_sce)
class: SingleCellExperiment 
dim: 31363 15962 
metadata(9): Samples Samples ... Samples Samples
assays(1): counts
rownames(31363): MIR1302-2HG OR4F5 ... AC213203.1 FAM231C
rowData names(3): ID Symbol Type
colnames: NULL
colData names(13): Sample Barcode ... subsets_mt_percent total
reducedDimNames(0):
spikeNames(0):
altExpNames(0):
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
In [14]:
%%R
# Normalization
set.seed(55063)
clusters <- quickCluster(hs_sce)
hs_sce <- computeSumFactors(hs_sce, cluster = clusters)
hs_sce <- logNormCounts(hs_sce)

set.seed(55063)
mod_mf <- modelGeneVar(hs_sce)
hvgs_mf <- getTopHVGs(mod_mf, prop = 0.1)

length(hvgs_mf)
[1] 1503
In [15]:
%%R
# Dimensionality reduction
set.seed(55063)
hs_sce <- runPCA(hs_sce, subset_row = hvgs_mf)
hs_sce <- runTSNE(hs_sce, dimred = "PCA", num_threads = 24, verbose = F)
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
In [16]:
%%R
# Perform Batch correction
# Identify HVGs based on sample
set.seed(55063)
mod_hs_batch <- modelGeneVar(hs_sce, block = colData(hs_sce)$Library)
hvgs_hs_batch <- getTopHVGs(mod_hs_batch, prop = 0.1)

set.seed(55063)
hs_sce_corrected <- fastMNN(hs_sce, subset.row = hvgs_hs_batch, batch=hs_sce$Library)

set.seed(55063)
hs_sce_corrected <- runTSNE(hs_sce_corrected, dimred = "corrected", perplexity=300, num_threads = 12, verbose = F)
reducedDim(hs_sce, "mnn") <- reducedDim(hs_sce_corrected, "corrected")
reducedDim(hs_sce, "mnnTSNE") <- reducedDim(hs_sce_corrected, "TSNE")
In [17]:
%%R -o hs_sce
length(hvgs_hs_batch)
[1] 1387
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
In [18]:
hs_sce.obs.index = hs_sce.obs['Barcode'].astype('str')+'-'+hs_sce.obs['Sample'].astype('str')
In [19]:
hs_sce.var_names_make_unique()
In [20]:
hs_sce.obs['Sample'].value_counts()
Out[20]:
Adult5    3383
Adult4    2858
Donor2    2845
Donor3    2607
Adult3    2168
Donor1    2101
Name: Sample, dtype: int64
In [21]:
hs_sce.obs['sum'].describe()
Out[21]:
count     15962.000000
mean      14654.270705
std       15343.561765
min        3502.000000
25%        5696.250000
50%        9383.500000
75%       17307.750000
max      212085.000000
Name: sum, dtype: float64
In [22]:
hs_sce.obs['detected'].describe()
Out[22]:
count    15962.000000
mean      3568.814935
std       1866.437132
min        214.000000
25%       2030.250000
50%       3233.000000
75%       4783.000000
max      12059.000000
Name: detected, dtype: float64
In [23]:
sc.pl.tsne(hs_sce, color = ["Sample"], cmap=heatcolors, size = 8, edgecolor = "black", linewidths = 0.1,
           ncols = 2, legend_fontsize = 8, title="Before batch correction")
... storing 'Sample' as categorical
... storing 'Barcode' as categorical
... storing 'Library' as categorical
... storing 'Symbol' as categorical
... storing 'Type' as categorical
In [24]:
hs_sce.obsm['X_tsne'] = hs_sce.obsm['mnnTSNE']
In [25]:
sc.pl.tsne(hs_sce, color = ["Sample"], cmap=heatcolors,
           size = 8, 
           edgecolor = "black", 
           linewidths = 0.1,
           ncols = 2, 
           legend_fontsize = 8, title="After batch correction")
In [26]:
hs_sce.obsm["X_pca"] = hs_sce.obsm["mnn"]
In [27]:
sc.pp.neighbors(hs_sce, n_neighbors = 5)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:06)
In [28]:
sc.tl.leiden(hs_sce, resolution = 1.5, key_added = 'leiden_1.5')
running Leiden clustering
    finished: found 42 clusters and added
    'leiden_1.5', the cluster labels (adata.obs, categorical) (0:00:02)
In [29]:
sc.pl.tsne(hs_sce, color=["leiden_1.5"], 
           size=8, 
           legend_loc = "on data", 
           edgecolor = "black", 
           linewidths = 0.1,
           legend_fontsize = 8, 
           alpha=0.2)
In [30]:
wv = sc.pl.stacked_violin(hs_sce, var_names=["DDX4", "VIM",  "ID4", "REC8", "STRA8", "AURKA", "ACRV1",
                                             "PRM2", "AMH", "CD74", "VWF", "ACTA2", "TAGLN", "CLU", "DCN", 
                                             "INSL3", "INHBA", "SOX9", "DLK1", "IGF1", "IGF2", "CFD", "SFRP1", "NR2F2"],
                    groupby="leiden_1.5", swap_axes=True, layer="logcounts")
In [31]:
hs_leiden_clusters = hs_sce.obs['leiden_1.5']
In [32]:
%%R -i hs_leiden_clusters
hs_sce$hs_leiden_clusters = hs_leiden_clusters
hs_sce$broadcluster <- as.character(hs_sce$hs_leiden_clusters)
In [33]:
%%R
hs_sce$broadcluster[hs_sce$broadcluster %in% c("14", "16", "0", "38", "12", "40", "41")] = "Somatic"

hs_sce$broadcluster[hs_sce$broadcluster %in% c("35","36", "37",  "39", "28", "29", "33", "34")] = "Outliers"

hs_sce$broadcluster[!hs_sce$broadcluster %in% c("Outliers", "Somatic")] = "Germ"

Human germ and somatic cells analysis

In [34]:
%%R -o hs_sce_gs
# Seperate Germ and somatic cells
hs_sce_gs <- hs_sce[,hs_sce$broadcluster %in% c("Germ", "Somatic")]
hs_sce_gs <- hs_sce_gs[Matrix::rowSums(counts(hs_sce_gs)) > 0, ]
assay(hs_sce_gs, "normcounts") <- as(((2^logcounts(hs_sce_gs))-1), "dgCMatrix")
hs_sce_gs$hs_leiden_clusters <- as.character(hs_sce_gs$hs_leiden_clusters)
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
In [35]:
hs_sce_gs.obs.index = hs_sce_gs.obs['Barcode'].astype('str')+'-'+hs_sce_gs.obs['Sample'].astype('str')
In [36]:
hs_sce_gs.var_names_make_unique()
In [37]:
hs_sce_gs.obsm['X_tsne']=hs_sce_gs.obsm['mnnTSNE']
In [38]:
gsconditions = [
    (hs_sce_gs.obs["hs_leiden_clusters"].isin(['12', '40', '41'])),#Myoid
    (hs_sce_gs.obs["hs_leiden_clusters"].isin(['0'])),#Immature Leydig
    (hs_sce_gs.obs["hs_leiden_clusters"].isin(['38'])),#Sertoli
    (hs_sce_gs.obs["hs_leiden_clusters"].isin(['16'])),#Macrophage
    (hs_sce_gs.obs["hs_leiden_clusters"].isin(['14'])),#Endothelial
    (hs_sce_gs.obs["hs_leiden_clusters"].isin(['23', '13', '7', '19', '4'])),#Spermatogonia
    (hs_sce_gs.obs["hs_leiden_clusters"].isin(['8', '6', '18', '20', '31', '32', '17', '9', '26', '27', '21'])),#Spermatocytes
    (hs_sce_gs.obs["hs_leiden_clusters"].isin(['10', '15', '22', '24', '25'])),#Round spermatids
    (hs_sce_gs.obs["hs_leiden_clusters"].isin(['5', '1', '3', '2'])),#Elongating Spermatids
    (hs_sce_gs.obs["hs_leiden_clusters"].isin(['11', '30']))]#Sperm
gsgroups = ["Myoid", "Immature Leydig", "Sertoli", "Macrophage", "Endothelial", "Spermatogonia", "Spermatocytes","Round spermatids","Elongating spermatids","Sperm"]
In [39]:
gs_clusters = np.select(gsconditions, gsgroups, default="Germ")
In [40]:
hs_sce_gs.obs["broad_clusters"] = gs_clusters
In [41]:
hs_sce_gs.obs["broad_clusters"] = hs_sce_gs.obs["broad_clusters"].astype("category").cat.reorder_categories(["Myoid", "Immature Leydig", "Sertoli", "Macrophage", "Endothelial", 
                                                                                                             "Spermatogonia", "Spermatocytes","Round spermatids","Elongating spermatids","Sperm"],ordered=True)
In [42]:
hs_sce_gs.obs['hs_leiden_clusters_celltype'] = hs_sce_gs.obs['hs_leiden_clusters'].astype(str) +"-"+hs_sce_gs.obs['broad_clusters'].astype(str)
In [43]:
cols_clusters = ["#FFB292","gold",  "#936C00","lime", "red", "#F16913",  "#4C7C5E", "#2171B5","#eaa9bd", "#91357d"]#"lightslategray",
In [44]:
sc.pl.tsne(hs_sce_gs, color=["broad_clusters"], legend_fontsize=10, palette=cols_clusters, 
           size=20, edgecolor="black", linewidths=0.1, title="Broad cell types")
... storing 'Sample' as categorical
... storing 'Barcode' as categorical
... storing 'Library' as categorical
... storing 'hs_leiden_clusters' as categorical
... storing 'broadcluster' as categorical
... storing 'hs_leiden_clusters_celltype' as categorical
... storing 'Symbol' as categorical
... storing 'Type' as categorical
In [45]:
sc.tl.dendrogram(hs_sce_gs, groupby="hs_leiden_clusters_celltype")
    using 'X_pca' with n_pcs = 50
Storing dendrogram info using `.uns['dendrogram_hs_leiden_clusters_celltype']`
In [46]:
sc.pl.correlation_matrix(hs_sce_gs, groupby="hs_leiden_clusters_celltype", figsize=(8,6))
In [47]:
sc.tl.rank_genes_groups(hs_sce_gs, "broad_clusters", n_genes = 6000, method = "wilcoxon", layer = "logcounts", use_raw = False)
ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:48)
In [48]:
gslist = {}
for i in hs_sce_gs.obs["broad_clusters"].cat.categories:
    genes = sc.get.rank_genes_groups_df(hs_sce_gs, group=i,pval_cutoff=0.01, log2fc_min=1.25, 
                                        key="rank_genes_groups").sort_values(by=["logfoldchanges"], ascending=[False]).dropna(axis=0)["names"].to_list()
    gslist[i] = genes
In [49]:
for key, value in gslist.items():
    #print value
    print(key, len([item for item in value if item]))
Myoid 2037
Immature Leydig 1499
Sertoli 648
Macrophage 1535
Endothelial 2049
Spermatogonia 3382
Spermatocytes 3404
Round spermatids 3624
Elongating spermatids 3192
Sperm 1243
In [50]:
gs_L1 = []
for i in hs_sce_gs.obs["broad_clusters"].cat.categories:
    genes = sc.get.rank_genes_groups_df(hs_sce_gs, group=i,pval_cutoff=0.01, log2fc_min=1.25,
                                        key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
    gs_L1.extend(genes)
In [51]:
gs_unique_genes = list(unique_everseen(gs_L1))
len(gs_unique_genes)
Out[51]:
14487
In [52]:
mat = sc.pl.matrixplot(hs_sce_gs, gs_unique_genes, groupby="broad_clusters", figsize=(4, 4),standard_scale="var", 
                       linewidth=.000001,swap_axes=True, cmap=heatcolors_wr, dendrogram=False, layer="logcounts",
                       show=False, show_gene_labels=False)

Human germ cell analysis

In [53]:
%%R -o hs_sce_germ
# Seperate Germ and somatic cells
## GERM
hs_sce_germ <- hs_sce[,hs_sce$broadcluster=="Germ"]
hs_sce_germ <- hs_sce_germ[Matrix::rowSums(counts(hs_sce_germ)) > 0, ]
hs_sce_germ$hs_leiden_clusters <- factor(hs_sce_germ$hs_leiden_clusters)

# Identify HVGs based on sample
set.seed(55063)
mod_hs_batch_germ <- modelGeneVar(hs_sce_germ, block=colData(hs_sce_germ)$Library)
hvgs_hs_batch_germ <- getTopHVGs(mod_hs_batch_germ, prop=0.1)
print(length(hvgs_hs_batch_germ))

set.seed(55063)
hs_sce_germ_corrected <- fastMNN(hs_sce_germ, subset.row=hvgs_hs_batch_germ, batch=hs_sce_germ$Library)
reducedDim(hs_sce_germ, "mnn") <- reducedDim(hs_sce_germ_corrected, "corrected")
hs_sce_germ$hs_leiden_clusters <- as.character(hs_sce_germ$hs_leiden_clusters)
[1] 1315
Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
In [54]:
hs_sce_germ.obs.index = hs_sce_germ.obs['Barcode'].astype('str')+'-'+hs_sce_germ.obs['Sample'].astype('str')
In [55]:
hs_sce_germ.var_names_make_unique()
In [56]:
hs_sce_germ.obsm['X_tsne'] = hs_sce_germ.obsm['mnnTSNE']
In [57]:
sc.pl.tsne(hs_sce_germ, color=["UTF1", "ID4","FGFR3", "SOHLH1", "UCHL1","DMRT1", "KIT", "STRA8", "REC8",
                              "PRDM9","SPO11","DMC1","MEIOB", "BRCA2","ATR","SYCP1","SMC1B", "SMC3","PIWIL1",
                               "PIWIL2", "RAD51", "SYCP2","SYCP3","HORMAD1","HORMAD2", "AURKA", "PLK1",
                              "NME5","SPACA1", "ACRV1", "AKAP14","TNP2","PRM2", "CRISP2", "TPPP2", "OAZ3"],
           legend_fontsize=8, color_map=heatcolors, size=20, edgecolor="black", linewidths=0.1, wspace=0.1,
           hspace=0.2, ncols=4, layer="logcounts")
... storing 'Sample' as categorical
... storing 'Barcode' as categorical
... storing 'Library' as categorical
... storing 'hs_leiden_clusters' as categorical
... storing 'broadcluster' as categorical
... storing 'Symbol' as categorical
... storing 'Type' as categorical
In [58]:
hs_sce_germ.obsm["X_pca"] = hs_sce_germ.obsm["mnn"]
In [59]:
hs_sce_germ.obs["hs_leiden_clusters"].cat.reorder_categories(['23', '13', '7', '19', '4', '8', '6', '18', '20', '31',
                                                              '32', '17', '9', '26', '27', '21', '10', '15', '22', '24',
                                                              '25', '5', '1', '3', '2', '11', '30'], inplace = True)
In [60]:
germADconditions = [
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["23"])),#Undiff1
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["13"])),#Undiff2
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["7"])),#E-diff
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["19"])),#Diff1
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["4"])),#Diff2
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["8"])),#pre-Lep
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["6"])),#Lep1
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["18"])),#Lep2
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["20"])),#Zyg
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["31"])),#P1
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["32"])),#P2
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["17"])),#P3
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["9"])),#P4
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["26"])),#P5
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["27"])),#P6
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["21"])),#Dip & Sec. spermatocytes
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["10"])),#RS1
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["15"])),#RS2
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["22"])),#RS3
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["24"])),#RS4
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["25"])),#RS5
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["5"])),#ES1
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["1"])),#ES2
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["3"])),#ES3
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["2"])),#ES4
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["11"])),#Sperm1
    (hs_sce_germ.obs["hs_leiden_clusters"].isin(["30"]))]#Sperm2
germADgroups = ["Undiff1", "Undiff2", "E-diff", "Diff1", "Diff2", "pre-Lep", "Lep1", "Lep2", "Zyg", "Pach1","Pach2","Pach3","Pach4","Pach5","Pach6", "Dip & Sec. Spcs", "RS1","RS2","RS3","RS4","RS5","ES1","ES2","ES3","ES4","Sperm1", "Sperm2"]
In [61]:
hs_leiden_clusters = np.select(germADconditions, germADgroups, default="Germ")
In [62]:
hs_sce_germ.obs["germAD_clusters"] = hs_leiden_clusters
In [63]:
hs_sce_germ.obs["germAD_clusters"] = hs_sce_germ.obs["germAD_clusters"].astype("category").cat.reorder_categories(["Undiff1", "Undiff2", "E-diff", "Diff1", "Diff2", "pre-Lep", "Lep1", "Lep2", "Zyg", "Pach1","Pach2","Pach3","Pach4","Pach5","Pach6",
                                                                                                                   "Dip & Sec. Spcs", "RS1","RS2","RS3","RS4","RS5","ES1","ES2","ES3","ES4","Sperm1", "Sperm2"],ordered=True)
In [64]:
cols_clusters = ["#FEEDDE", "#FDBE85", "#FD8D3C", "#E6550D", "#A63603",#Human
                 "#F7FCF5", "#DEE9DF", "#C5D7C9", "#ACC4B3", "#94B29D", "#7BA088", "#628D72", "#4A7B5C", "#316846", "#185630", "#00441B",
                 "#EFF3FF", "#BDD7E7", "#6BAED6", "#3182BD", "#08519C",
                 "#f3cbd3", "#eaa9bd", "#dd88ac", "#ca699d", "#b14d8e", "#91357d"]
In [65]:
sc.pl.tsne(hs_sce_germ, color=["germAD_clusters"], size=10, palette=cols_clusters,
            edgecolor="black", linewidths=0.1, legend_fontsize=8, title="")
In [66]:
hs_sce_gs.obs['Annotated'] = hs_sce_gs.obs['broad_clusters']
hs_sce_gs.obs['Annotated'] = hs_sce_gs.obs['Annotated'].astype('str')

Visualize the annotated germ and somatic celltypes integrated on tSNE

In [67]:
for idx, x in hs_sce_germ.obs['germAD_clusters'].iteritems():
    hs_sce_gs.obs.at[idx, 'Annotated'] = x 
In [68]:
hs_sce_gs.obs["Annotated"] = hs_sce_gs.obs["Annotated"].astype("category").cat.reorder_categories(["Myoid", "Immature Leydig", "Sertoli", "Macrophage", "Endothelial", "Undiff1", "Undiff2", "E-diff", "Diff1", "Diff2",
                                                                                                   "pre-Lep", "Lep1", "Lep2", "Zyg", "Pach1","Pach2","Pach3","Pach4","Pach5","Pach6",
                                                                                                   "Dip & Sec. Spcs", "RS1","RS2","RS3","RS4","RS5","ES1","ES2","ES3","ES4","Sperm1", "Sperm2"],ordered=True)
In [69]:
palette = ["#FFB292","gold",  "#936C00","lime", "red",
           "#FEEDDE", "#FDBE85", "#FD8D3C", "#E6550D", "#A63603",#Human
                 "#F7FCF5", "#DEE9DF", "#C5D7C9", "#ACC4B3", "#94B29D", "#7BA088", "#628D72", "#4A7B5C", "#316846", "#185630", "#00441B",
                 "#EFF3FF", "#BDD7E7", "#6BAED6", "#3182BD", "#08519C",
                 "#f3cbd3", "#eaa9bd", "#dd88ac", "#ca699d", "#b14d8e", "#91357d"]
In [70]:
sc.pl.tsne(hs_sce_gs, color=["Annotated"], size=10, palette=palette,
            edgecolor="black", linewidths=0.1, legend_fontsize=8, title="")
In [71]:
hs_sce_germ.obs['Annotated'] = hs_sce_germ.obs['germAD_clusters']
In [72]:
sc.pp.neighbors(hs_sce_germ, n_neighbors=5) # PAGA - Partition-based graph abstraction
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)
In [73]:
sc.tl.paga(hs_sce_germ, groups="germAD_clusters") # PAGA - Partition-based graph abstraction for germ cells
running PAGA
    finished: added
    'paga/connectivities', connectivities adjacency (adata.uns)
    'paga/connectivities_tree', connectivities subtree (adata.uns) (0:00:00)
In [74]:
sc.pl.paga(hs_sce_germ, color="germAD_clusters", threshold=0.15, fontsize=8, fontoutline=1, frameon=False)
--> added 'pos', the PAGA positions (adata.uns['paga'])
In [75]:
sc.tl.rank_genes_groups(hs_sce_germ, "germAD_clusters", n_genes=6000, method="wilcoxon", layer="logcounts", use_raw=False)
ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:57)
In [76]:
germlist = {}
for i in hs_sce_germ.obs["germAD_clusters"].cat.categories:
    genes = sc.get.rank_genes_groups_df(hs_sce_germ, group=i,pval_cutoff=0.01, log2fc_min=1.25, 
                                        key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
    germlist[i] = genes
In [77]:
germlist_L1 = []
for i in hs_sce_germ.obs["germAD_clusters"].cat.categories:
    genes = sc.get.rank_genes_groups_df(hs_sce_germ, group=i,pval_cutoff=0.01, log2fc_min=1.25, 
                                        key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
    germlist_L1.extend(genes)
In [78]:
# Obtain unique number of differentially expressed genes
germ_unique_genes = list(unique_everseen(germlist_L1))
len(germ_unique_genes)
Out[78]:
17405
In [79]:
ax = sc.pl.matrixplot(hs_sce_germ, germlist_L1, groupby="germAD_clusters", figsize=(10,6),standard_scale="var", 
                      linewidth=.0000001, swap_axes=True, cmap=heatcolors_wr, dendrogram=False, layer="logcounts", 
                      show_gene_labels=False)
In [80]:
hs_sce_germ.X = hs_sce_germ.layers["logcounts"]
In [81]:
# Load gene annnotation file
allgenes = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/28Feb/Human/Human_genes_annotation_gtf.csv")
allgenes.index = allgenes['gene_name'].tolist()
allgenes = allgenes[['gene_id', 'seqname']]
allgenes.columns = ['GeneStableID', 'Chromosome']
In [82]:
allgenes = allgenes[allgenes['Chromosome'].isin(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20',
                                                 '21', '22', '3', '4', '5', '6', '7', '8', '9', 'MT', 'X', 'Y'])]
In [83]:
for ch in allgenes['Chromosome'].astype("category").cat.categories.tolist():
    genes = allgenes["Chromosome"]==ch
    Agenes = set(allgenes[genes].index.tolist()).intersection(hs_sce_germ.var_names.tolist())
    hs_sce_germ.obs[ch] = np.ravel((hs_sce_germ[:, list(Agenes)].X != 0).sum(axis=1))
In [84]:
AllChr_df = pd.DataFrame()
for column in ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', 
               '2', '20', '21', '22', '3', '4', '5', '6', '7', '8', '9', 'MT', 'X', 'Y']:
    x = hs_sce_germ.obs[column]
    AllChr_df[column] = x
    
AllChr_df['germAD_clusters'] = hs_sce_germ.obs['germAD_clusters']
AllChr_df_melted = AllChr_df.melt(id_vars="germAD_clusters", var_name="key", value_name="value")
In [85]:
AllChr_df_melted_mean = AllChr_df_melted.groupby(["germAD_clusters", "key"], as_index=False).mean()
#Reshape the data
AllChr_df_mean_reshaped = AllChr_df_melted_mean.pivot(index="key",columns="germAD_clusters")
In [86]:
AllChr_df_melted_std = AllChr_df_melted.groupby(["germAD_clusters", "key"]).std().reset_index()
AllChr_df_melted_std_reshaped = AllChr_df_melted_std.pivot(index="key",columns="germAD_clusters")
In [87]:
plt.rcParams["figure.figsize"]=6,3
cols = ["#87CEEB", "#83C8EA", "#7FC3E9", "#7BBEE9", "#78B8E8", "#74B3E8", 
"#70AEE7", "#6DA8E7", "#69A3E6", "#659EE6", "#6298E5", "#5E93E5", 
"#5A8EE4", "#5788E4", "#5383E3", "#4F7EE3", "#4C78E2", "#4873E2", 
"#446EE1", "#4169E1", "#446EE1", "#4169E1", "darkorange"]
i=0
for c in ['1', '2','3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16',
          '17', '18', '19', '20', '21', '22', 'Y']:
    plt.plot(AllChr_df_mean_reshaped.iloc[0].index.get_level_values(1).tolist(), 
             AllChr_df_mean_reshaped.loc[c], linewidth=0.75, color=cols[i], alpha=1, label=c)
    i+=1

    
plt.plot(AllChr_df_mean_reshaped.iloc[0].index.get_level_values(1).tolist(), 
             AllChr_df_mean_reshaped.iloc[21], linewidth=1.5, marker='', color= "#C70039", label="X")


# Add legend
plt.legend(loc="center", ncol=2, fontsize=6, bbox_to_anchor=(1.06, 0.58), frameon=False)
# Add titles
#plt.title("Mean no. of genes", loc='left', fontsize=12, fontweight=2, color='black')
plt.ylabel("Mean no. of genes", fontsize=8)
plt.xlabel("")
plt.xticks(rotation=40, fontsize=8,horizontalalignment="right", rotation_mode="anchor")
plt.tick_params(axis="both",  which="minor", left=False)
plt.tick_params(axis="both",  which="major",  bottom=True, top=False,  left=True, labelbottom=True, pad=0.5, length=1.5, labelsize=8)
plt.yscale('log', basey=10)
plt.tight_layout()
plt.margins(0.003,0.003)
plt.grid(linestyle='-', linewidth='0.1')
In [88]:
# Retrive chromosomes X, Y, 9 and autosomal genes for X to A ratio
chrX_genes = allgenes["Chromosome"]=="X"
chrY_genes = allgenes["Chromosome"]=="Y"
chr9_genes = allgenes["Chromosome"]=="9"
chrA_genes = allgenes[~allgenes.Chromosome.isin(["X","Y", "MT"])]
In [89]:
# Retrieve the X and Y genes specifically present in the germ cells alone
germ_Xgenes = set(allgenes[chrX_genes].index.tolist()).intersection(hs_sce_germ.var_names.tolist())
germ_Ygenes = set(allgenes[chrY_genes].index.tolist()).intersection(hs_sce_germ.var_names.tolist())
germ_9genes = set(allgenes[chr9_genes].index.tolist()).intersection(hs_sce_germ.var_names.tolist())
germ_Agenes = set(chrA_genes.index.tolist()).intersection(hs_sce_germ.var_names.tolist())
In [90]:
hs_sce_germ.obs["Chr X"] = np.mean(
    hs_sce_germ[:, list(germ_Xgenes)].X, axis=1).A1 / np.mean(hs_sce_germ[:, list(germ_Agenes)].X, axis=1).A1

hs_sce_germ.obs["Chr 9"] = np.mean(
    hs_sce_germ[:, list(germ_9genes)].X, axis=1).A1 / np.mean(hs_sce_germ[:, list(germ_Agenes)].X, axis=1).A1
In [91]:
germXA_data = hs_sce_germ.obs["germAD_clusters"].to_frame().join(hs_sce_germ.obs["Chr 9"]).join(hs_sce_germ.obs["Chr X"])#.join(hs_sce_germ.obs["Chr Y"])
#Melt the data to long shape
germXA_data_melted = germXA_data.melt(id_vars="germAD_clusters", var_name="key", value_name="value")
In [92]:
from scipy import stats
In [93]:
stats.mannwhitneyu(germXA_data_melted[(germXA_data_melted['germAD_clusters'].isin(['Diff2'])) & (germXA_data_melted['key'].isin(['Chr X']))].value,
              germXA_data_melted[(germXA_data_melted['germAD_clusters'].isin(['Zyg'])) & (germXA_data_melted['key'].isin(['Chr X']))].value)
Out[93]:
MannwhitneyuResult(statistic=5788.0, pvalue=7.355838754164686e-138)
In [94]:
from matplotlib.ticker import FormatStrFormatter

plt.rcParams["figure.figsize"]=6,3
bx = sns.violinplot(x="germAD_clusters", y="value",
            hue="key", palette=["#0D8DB3","#C70039"],fliersize=0.1,linewidth=0.5,split=True,scale="count",inner="quartile",
            data=germXA_data_melted)
bx.set(xlabel="")
bx.set_ylabel("Chromosome:Autosome ratio", fontsize=8)
bx.set_xticklabels(bx.get_xticklabels(), rotation=40, fontsize=8,horizontalalignment="right", rotation_mode="anchor", visible=True)
bx.grid(linestyle='-', linewidth='0.1')
bx.tick_params(
    axis="both",          # changes apply to the x-axis
    which="major",      # both major and minor ticks are affected
    bottom=True,      # ticks along the bottom edge are off
    top=False,  left=True,       # ticks along the top edge are off
    labelbottom=True,labeltop=False, pad=0.5, length=1.5)
bx.axhline(y=0.5,color='gray',linestyle='--')
bx.axhline(y=1,color='gray',linestyle='--')
plt.legend(loc="upper right", fontsize=6, frameon=False)
plt.tight_layout()
In [95]:
hs_sce_germ.obs["ChrX percent"] = (np.sum(
    hs_sce_germ[:, list(germ_Xgenes)].X, axis=1).A1 / np.sum(hs_sce_germ.X, axis=1).A1)*100

hs_sce_germ.obs["ChrY percent"] = (np.sum(
    hs_sce_germ[:, list(germ_Ygenes)].X, axis=1).A1 / np.sum(hs_sce_germ.X, axis=1).A1)*100
In [96]:
#Make a dataframe using selected columns
germ_dist_data = hs_sce_germ.obs['germAD_clusters'].to_frame().join(hs_sce_germ.obs['detected']).join(hs_sce_germ.obs['sum']).join(hs_sce_germ.obs['subsets_mt_percent']).join(hs_sce_germ.obs['ChrX percent']).join(hs_sce_germ.obs['ChrY percent'])
germ_dist_data.columns = ['germAD_clusters','No. of genes', 'UMI Count', '% Mito', '% ChrX', '% ChrY']
#Melt the data to long shape
germ_dist_data_melted = germ_dist_data.melt(id_vars='germAD_clusters', var_name='key', value_name='value')
In [97]:
g = sns.FacetGrid(germ_dist_data_melted, col="key", height=6, sharex=False, hue="germAD_clusters", aspect=.5, palette=cols_clusters) 
g.map(sns.violinplot, "value", "germAD_clusters", label='xxlarge', linewidth=0.1).set_titles("{col_name}").set_axis_labels("","")
Out[97]:
<seaborn.axisgrid.FacetGrid at 0x2aadc3ff4fd0>

Pseudotime analysis

In [98]:
sc.pp.neighbors(hs_sce_germ, n_pcs=50, knn=False, method="gauss")# Find the neighbors
sc.tl.diffmap(hs_sce_germ)
computing neighbors
WARNING: Using high n_obs without `knn=True` takes a lot of memory...
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:17)
computing Diffusion Maps using n_comps=15(=n_dcs)
computing transitions
    finished (0:00:17)
    eigenvalues of transition matrix
    [1.         0.9992454  0.99703485 0.9925455  0.9874981  0.9848782
     0.9756717  0.9705063  0.9602541  0.9532474  0.94517154 0.93815446
     0.92647946 0.91501564 0.90497625]
    finished: added
    'X_diffmap', diffmap coordinates (adata.obsm)
    'diffmap_evals', eigenvalues of transition matrix (adata.uns) (0:00:27)
In [99]:
hs_sce_germ.uns["iroot"] = np.flatnonzero(hs_sce_germ.obs["germAD_clusters"]  == "Undiff1")[0] # Set the root cell
sc.tl.dpt(hs_sce_germ)
computing Diffusion Pseudotime using n_dcs=10
    finished: added
    'dpt_pseudotime', the pseudotime (adata.obs) (0:00:00)
In [107]:
sc.settings.set_figure_params(dpi=100, dpi_save=300, format="pdf", frameon=True)
sns.set_context("paper", font_scale=1, rc={"font.size":8,"axes.labelsize":8,"axes.titlesize":14})
In [108]:
sc.pl.tsne(hs_sce_germ, color=["dpt_pseudotime"], legend_fontsize=8, cmap= "viridis",
           size=20, edgecolor="black", linewidths=0.05) #Use gauss method to find neighbors to get proper pseudotime

Analysis of human sex chromosome genes in germ cells

In [109]:
# Load chr X gene annnotation file
Xgenes = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/28Feb/Human/Human_Xgenes_Annotated_updated.csv")
In [110]:
Xgenes.index = Xgenes['gene_name'].tolist()
In [111]:
Xgenes = Xgenes[~Xgenes.index.duplicated(keep='last')]
In [112]:
Xgenes['GeneClass'].value_counts()
Out[112]:
Single-copy    860
Multicopy      152
Ampliconic      65
Name: GeneClass, dtype: int64
In [113]:
Xgenes[['Ancestral', 'Shared']] = Xgenes[['Ancestral','Shared']].fillna(value="NotShared")
In [114]:
# Subset data based on different classes of X chromosome genes
germXsingle_sce = hs_sce_germ[:,hs_sce_germ.var['Symbol'][hs_sce_germ.var['Symbol'].isin(Xgenes['gene_name'][Xgenes['GeneClass']=="Single-copy"])].tolist()]
germXsingle_sce.var_names_make_unique()
germXsingle_sce.obs_names_make_unique()
germXMulti_sce = hs_sce_germ[:,hs_sce_germ.var['Symbol'][hs_sce_germ.var['Symbol'].isin(Xgenes['gene_name'][Xgenes['GeneClass']=="Multicopy"])].tolist()]
germXMulti_sce.var_names_make_unique()
germXMulti_sce.obs_names_make_unique()
germXAmpliconic_sce = hs_sce_germ[:,hs_sce_germ.var['Symbol'][hs_sce_germ.var['Symbol'].isin(Xgenes['gene_name'][Xgenes['GeneClass']=="Ampliconic"])].tolist()]
germXAmpliconic_sce.var_names_make_unique()
germXAmpliconic_sce.obs_names_make_unique()
germY_sce = hs_sce_germ[:,list(germ_Ygenes)]
germY_sce.var_names_make_unique()
germY_sce.obs_names_make_unique()


sc.tl.rank_genes_groups(germXsingle_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
sc.tl.rank_genes_groups(germXMulti_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
sc.tl.rank_genes_groups(germXAmpliconic_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
sc.tl.rank_genes_groups(germY_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);


singleX_L1 = []
for i in germXsingle_sce.obs["Annotated"].cat.categories:
    genes = sc.get.rank_genes_groups_df(germXsingle_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
    singleX_L1.extend(genes)
    
MultiX_L1 = []
for i in germXMulti_sce.obs["Annotated"].cat.categories:
    genes = sc.get.rank_genes_groups_df(germXMulti_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
    MultiX_L1.extend(genes)
    
Ampli_L1 = []
for i in germXAmpliconic_sce.obs["Annotated"].cat.categories:
    genes = sc.get.rank_genes_groups_df(germXAmpliconic_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
    Ampli_L1.extend(genes)
    
singleY_L1 = []
for i in germY_sce.obs["Annotated"].cat.categories:
    genes = sc.get.rank_genes_groups_df(germY_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
    singleY_L1.extend(genes)

singleX_unique = pd.Series(singleX_L1).drop_duplicates().tolist()
MultiX_unique = pd.Series(MultiX_L1).drop_duplicates().tolist()
Ampli_unique = pd.Series(Ampli_L1).drop_duplicates().tolist()
singleY_unique = pd.Series(singleY_L1).drop_duplicates().tolist()

print("No. of single X copy genes: %d" %len(singleX_unique))
print("No. of multi X copy genes: %d" %len(MultiX_unique))
print("No. of ampliconic X genes: %d" %len(Ampli_unique))
print("No. of Y genes: %d" %len(singleY_unique))
ranking genes
Trying to set attribute `.uns` of view, copying.
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:02)
ranking genes
Trying to set attribute `.var` of view, copying.
... storing 'ID' as categorical
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:01)
ranking genes
Trying to set attribute `.uns` of view, copying.
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:01)
ranking genes
Trying to set attribute `.uns` of view, copying.
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:01)
No. of single X copy genes: 804
No. of multi X copy genes: 132
No. of ampliconic X genes: 50
No. of Y genes: 70
In [115]:
XYgenes = {'X Single-copy':singleX_unique, 'X Multi-copy':MultiX_unique, 'X Ampliconic':Ampli_unique, 'Y':singleY_unique}
In [116]:
X_sma = {'X Single-copy':singleX_unique, 'X Multi-copy':MultiX_unique, 'X Ampliconic':Ampli_unique}

X chromsome genes (Single, multi-copy and ampliconic) gene expression

In [117]:
ax = sc.pl.matrixplot(hs_sce_germ, X_sma, groupby="Annotated", figsize=(8,8),standard_scale="var", linewidth=.001,swap_axes=True,
                 cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=False)

Multi-copy and Ampliconic gene expression

In [118]:
ax = sc.pl.matrixplot(hs_sce_germ, {'Multi':MultiX_unique, 'Ampli':Ampli_unique}, groupby="Annotated", figsize=(8,7),standard_scale="var", linewidth=.01,swap_axes=True,
                 cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=False)
In [119]:
Xgenes_hs = Xgenes[Xgenes['gene_name'].isin(hs_sce_germ.var['Symbol'])]
In [120]:
Xgenes_hs['GeneClass'].value_counts()
Out[120]:
Single-copy    804
Multicopy      131
Ampliconic      50
Name: GeneClass, dtype: int64
In [121]:
pd.crosstab(Xgenes_hs['GeneClass'], Xgenes_hs['Shared'])
Out[121]:
Shared NotShared Shared_hs_mf Shared_hs_mm Shared_hs_mm_mf
GeneClass
Ampliconic 5 27 5 13
Multicopy 10 41 22 58
Single-copy 213 54 35 502
In [122]:
MultiX_unique_shared = {}
for i in Xgenes_hs['Shared'].astype('category').cat.categories:
    genes = [x for x in MultiX_unique if x in Xgenes_hs['gene_name'][(Xgenes_hs['GeneClass']=='Multicopy') & (Xgenes_hs['Shared']==i)].tolist()]
    MultiX_unique_shared[i] = genes

Multi-copy gene expression along with its conservation type

In [123]:
ax = sc.pl.matrixplot(hs_sce_germ, MultiX_unique_shared, groupby="Annotated", figsize=(10,7),standard_scale="var", linewidth=.01,swap_axes=True,
                 cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=False)
In [124]:
AmpliX_unique_shared = {}
for i in Xgenes_hs['Shared'].astype('category').cat.categories:
    genes = [x for x in Ampli_unique if x in Xgenes_hs['gene_name'][(Xgenes_hs['GeneClass']=='Ampliconic') & (Xgenes_hs['Shared']==i)].tolist()]
    AmpliX_unique_shared[i] = genes

Ampliconic gene expression along with its conservation type

In [125]:
ax = sc.pl.matrixplot(hs_sce_germ, AmpliX_unique_shared, groupby="Annotated", figsize=(8,7),standard_scale="var", linewidth=.01,swap_axes=True,
                 cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=True)

Y chromosome gene expression as dotplot

In [126]:
sc.pl.dotplot(hs_sce_germ,singleY_unique, groupby="Annotated", figsize=(10,6), standard_scale="var",color_map=heatcolors_wr,
                 dendrogram=False, layer="logcounts")
Out[126]:
GridSpec(2, 5, height_ratios=[0, 10.5], width_ratios=[9.05, 0, 0.2, 0.5, 0.25])

Analysis of ortholog and species-specific gene expression in germ cells

In [127]:
hs_unique = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/17Apr_rev/Hs_unique.txt")
In [128]:
Orthologs = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/Human_Mouse_Monkey_orthologs_21Feb_clean.txt", sep=" ")

Orthologs = Orthologs[(Orthologs.Crab_eating_macaque_homology_type=="ortholog_one2one") & (Orthologs.Mouse_homology_type=="ortholog_one2one")]
In [129]:
hs_ortho_df = pd.DataFrame(columns=["1:1:1 Othologs", "Human-specific", "Other orthologs"])
In [ ]:
for i in hs_sce_germ.obs['germAD_clusters'].astype('category').cat.categories.tolist():
    df = i+"_df"
    df = hs_sce_germ[hs_sce_germ.obs['germAD_clusters'].isin([i]),:];
    sc.pp.filter_genes(df, min_cells=round((df.n_obs*0.05)));
    hs_ortho_df.loc[i] = [len(df.var['ID'][df.var['ID'].isin(Orthologs.Gene_stable_ID.tolist())])/len(df.var['ID'])*100,
                          len(df.var['ID'][df.var['ID'].isin(hs_unique['x'].tolist())])/len(df.var['ID'])*100,
                          len(df.var['ID'][~df.var['ID'].isin(Orthologs.Gene_stable_ID.tolist()+hs_unique['x'].tolist())])/len(df.var['ID'])*100]
In [131]:
hs_ortho_df['Groups'] = hs_ortho_df.index
In [132]:
plt.rcParams["figure.figsize"]=3,10
hs_ortho_df_melted = hs_ortho_df.melt('Groups', var_name='Type',  value_name='value')
g = sns.factorplot(x="Groups", y="value", hue='Type', data=hs_ortho_df_melted, kind="point",
                   palette = ["#DF8F44FF", "#AD4A38", "#80796BFF"], height=5, aspect=1.3, scale=0.6)
plt.xticks(rotation=40, fontsize=8,horizontalalignment="right", rotation_mode="anchor");
plt.xlabel("");
plt.ylabel("% Genes expressed", fontsize=10);